In [3]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from libpysal.weights import KNN
from esda.moran import Moran
from pointpats import PoissonPointProcess, k, Window
import hdbscan
import numpy as np
import pysal
import seaborn
pd.options.display.max_columns = None
import seaborn as sns
from astropy.stats import RipleysKEstimator
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon
import contextily as ctx
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import entropy
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import itertools
from sklearn.metrics import silhouette_score
from geopy.distance import geodesic
import warnings
warnings.filterwarnings('ignore')
Helper Functions¶
In [4]:
def near_station(centroid, station_df, station_buffer=500, geo_col='geometry', name_col='name'):
distances = {}
for i in range(len(station_df)):
station_name = station_df[name_col].iloc[i]
station_point = station_df[geo_col].iloc[i] # Shapely Point
# geodesic expects (lat, lon)
dist_m = geodesic(
(centroid.y, centroid.x),
(station_point.y, station_point.x)
).meters
distances[station_name] = dist_m
return any(d <= station_buffer for d in distances.values()), min(distances.values()), distances
def compute_entropy(counts):
probs = counts / counts.sum()
return entropy(probs)
def compute_gini(counts):
if len(counts) == 1:
return 1
probs = counts / counts.sum()
sorted_probs = np.sort(probs)
n = len(probs)
cum = np.cumsum(sorted_probs)
gini = 1 - (2 / (n - 1)) * np.sum(cum) + (1 / n)
return gini
def classify_row(row, final_gdf_clustered, ent_thresh=0.7, entrop_col = 'entropy_norm', mode = 'sector_number'):
if mode == 'sector_number':
ent_thresh = 0.4 * np.log2(final_gdf_clustered[(final_gdf_clustered['year'] == row['year']) &
(final_gdf_clustered['scat_category'] != 'Other')]['scat_category'].nunique())
lower_thresh = 0.2 * np.log2(final_gdf_clustered[(final_gdf_clustered['year'] == row['year']) &
(final_gdf_clustered['scat_category'] != 'Other')]['scat_category'].nunique())
elif mode == 'entropy_quantile':
ent_thresh = final_gdf_clustered[final_gdf_clustered['year'] == row['year']][entrop_col].quantile(ent_thresh)
lower_thresh = final_gdf_clustered[final_gdf_clustered['year'] == row['year']][entrop_col].quantile(1 - ent_thresh)
elif mode == 'entropy_median':
ent_thresh = final_gdf_clustered[final_gdf_clustered['year'] == row['year']][entrop_col].median()
lower_thresh = ent_thresh
else:
lower_thresh = 1 - ent_thresh
if row[entrop_col] >= ent_thresh:
return 'High diversity'
elif row[entrop_col] < lower_thresh:
return 'Homogeneous'
else:
return 'Moderate diversity'
def groupby_multipoly(df, by, aggfunc="first"):
data = df.drop(labels=df.geometry.name, axis=1)
aggregated_data = data.groupby(by=by).agg(aggfunc)
# Process spatial component
def merge_geometries(block):
return MultiPolygon(block.values)
g = df.groupby(by=by, group_keys=False)[df.geometry.name].agg(
merge_geometries
)
aggregated_geometry = gpd.GeoDataFrame(g, geometry=df.geometry.name, crs=df.crs).reset_index()
return aggregated_geometry
def load_data(path, year_col='year', lat_col='latitude', lon_col='longitude'):
df = pd.read_pickle(path)
df['geometry'] = df.apply(lambda row: Point(row[lon_col], row[lat_col]), axis=1)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
return gdf.to_crs('EPSG:27700')
def compute_morans_i(gdf, attribute, k = 8):
if not k:
k = 8
k = k if len(gdf) > k else len(gdf) - 1
w = KNN.from_dataframe(gdf, k=k)
w.transform = 'R'
moran = Moran(gdf[attribute], w)
return moran.I, moran.p_sim
def compute_ripleys_k(gdf, max_dist=100, mode='none', draw_plot = False):
coords = np.array([(geom.x, geom.y) for geom in gdf.geometry])
bounds = gdf.total_bounds
area = (bounds[2] - bounds[0]) * (bounds[3] - bounds[1])
Kest = RipleysKEstimator(area=area, x_max=bounds[2], y_max=bounds[3], x_min=bounds[0], y_min=bounds[1])
r = np.linspace(0, int((area)**0.5), max_dist)
reply_data = Kest(data=coords, radii=r, mode = mode)
poisson_data = Kest.poisson(r)
reply_df = pd.concat([pd.DataFrame({'radius' : r, 'reply' : poisson_data, 'type' : ['Poisson'] * len(poisson_data)}),
pd.DataFrame({'radius' : r, 'reply' : reply_data, 'type' : ['Reply_K'] * len(reply_data)})],
ignore_index = True)
if draw_plot:
sns.lineplot(reply_df, x = 'radius', y = 'reply', hue = 'type')
return reply_df
def apply_dbscan(gdf, eps=300, min_samples=5):
coords = gdf.geometry.apply(lambda p: (p.x, p.y)).tolist()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(coords)
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X_scaled)
gdf['cluster'] = db.labels_
return gdf
def apply_hdbscan(gdf, metric='euclidean'):
min_cluster_sizes = [15, 25, 35, 50, 60, 80, 100]
epsilons = [0, 50, 100, 150, 200]
best_score = -1
best_params = None
results = []
coords = gdf.geometry.apply(lambda p: (p.x, p.y)).tolist()
for mcs, eps in itertools.product(min_cluster_sizes, epsilons):
clusterer = hdbscan.HDBSCAN(min_cluster_size = mcs, metric = metric, cluster_selection_epsilon = eps)
labels = clusterer.fit_predict(coords)
if np.all(labels == -1):
continue
if len(set(labels)) > 1:
sil_score = silhouette_score(coords, labels)
else:
sil_score = -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
noise_pct = np.sum(labels == -1) / len(labels) * 100
results.append({
'min_cluster_size': mcs,
'epsilon_m': eps,
'silhouette': sil_score,
'n_clusters': n_clusters,
'noise_pct': noise_pct
})
if sil_score > best_score:
best_score = sil_score
best_params = (mcs, eps)
print(f"After trying the ranges for min and epis, the best values are {best_params} with score is {best_score}")
clusterer = hdbscan.HDBSCAN(min_cluster_size = best_params[0], metric = metric, cluster_selection_epsilon = best_params[1])
gdf['cluster'] = clusterer.fit_predict(coords)
return gdf, pd.DataFrame(results)
def gini_index(series):
proportions = series.value_counts(normalize=True)
return 1 - (proportions ** 2).sum()
def analyze_over_time(gdf_all, time_col='year', attribute='employee_count', industry = None, industry_col = 'scat_category',
min_cluster_size = None, min_cluster_percent = 0.02, min_cluster_size_limit = 50, k_morans_i = 8,
reply_draw_plot = False, cluster_draw_plot = False, metric='euclidean'):
if industry:
gdf_all_tmp = gdf_all[gdf_all[industry_col] == industry].copy()
title = f"Business Clusters - {industry}"
else:
gdf_all_tmp = gdf_all.copy()
title = f"Business Clusters"
years = sorted(gdf_all_tmp[time_col].unique())
results = []
cluster_sil_scores = []
for year in years:
gdf_year = gdf_all_tmp[gdf_all_tmp[time_col] == year].copy()
print(gdf_year.shape)
gdf_year_oa = gdf_year.groupby(['oa21cd', 'oa_lat', 'oa_long']).count()[['CompanyID']].reset_index().rename(
columns = {'CompanyID' : 'bus_count'})
print(gdf_year_oa.shape)
gdf_year_oa['geometry'] = gdf_year_oa.apply(lambda row: Point(row['oa_long'], row['oa_lat']), axis=1)
gdf_year_oa = gpd.GeoDataFrame(gdf_year_oa, geometry='geometry', crs='EPSG:4326')
gdf_year_oa = gdf_year_oa.to_crs('EPSG:27700')
moran_i, p = compute_morans_i(gdf_year_oa, 'bus_count', k = k_morans_i)
reply_df = compute_ripleys_k(gdf_year, draw_plot = reply_draw_plot)
gdf_clustered, sil_score = apply_hdbscan(gdf_year, metric = metric)
sil_score['year'] = year
cluster_sil_scores.append(sil_score)
cluster_trends = gdf_clustered.groupby(['year', 'cluster']).size().reset_index(name='count')
cluster_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='count').fillna(0)
sector_dist = gdf_clustered.groupby(['year', 'cluster', 'scat_category']).size().reset_index(name='count')
sector_dist['proportion'] = sector_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
dominant_sectors = sector_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
.groupby(['year', 'cluster']).first().reset_index()
print(f"\nYear: {year}")
print(f"Moran's I: {moran_i:.4f}, p-value: {p:.4f}")
print(f"Clusters found: {gdf_clustered['cluster'].nunique() - (1 if -1 in gdf_clustered['cluster'].unique() else 0)}")
results.append({
'year': year,
'moran_i': moran_i,
'moran_p': p,
'num_clusters': gdf_clustered['cluster'].nunique() - (1 if -1 in gdf_clustered['cluster'].unique() else 0),
'clustered_gdf': gdf_clustered,
'reply_df': reply_df,
'cluster_trends' : cluster_trends,
'cluster_trends_pivot' : cluster_trends_pivot,
'sector_dist' : sector_dist,
'dominant_sectors' : dominant_sectors
})
if cluster_draw_plot:
fig, ax = plt.subplots(figsize=(6,6))
gdf_clustered.plot(column='cluster', cmap='tab20', legend=True, ax=ax)
plt.title(title + f" - {year}")
plt.show()
cluster_sil_scores = pd.concat(cluster_sil_scores, ignore_index = True)
final_gdf_clustered = pd.concat([c['clustered_gdf'] for c in results], ignore_index = True)
final_gdf_clustered = final_gdf_clustered.sort_values(['CompanyID', 'year'])
final_gdf_clustered['prev_cluster'] = final_gdf_clustered.groupby('CompanyID')['cluster'].shift(1)
final_gdf_clustered['cluster_change'] = final_gdf_clustered['cluster'] != final_gdf_clustered['prev_cluster']
transitions = final_gdf_clustered.dropna(subset=['prev_cluster']) \
.groupby(['prev_cluster', 'cluster']).size().reset_index(name='count')
cluster_trends = final_gdf_clustered.groupby(['year', 'cluster']).size().reset_index(name='count')
cluster_trends['proportion'] = cluster_trends.groupby(['year'])['count'].transform(lambda x: x / x.sum())
cluster_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='count').fillna(0)
cluster_proportion_trends_pivot = cluster_trends.pivot(index='year', columns='cluster', values='proportion').fillna(0)
sector_dist = final_gdf_clustered.groupby(['year', 'cluster', 'scat_category']).size().reset_index(name='count')
sector_dist['proportion'] = sector_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
dominant_sectors = sector_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
.groupby(['year', 'cluster']).first().reset_index()
access_dist = final_gdf_clustered.groupby(['year', 'cluster', 'PTAL2023_main']).size().reset_index(name='count')
access_dist['proportion'] = access_dist.groupby(['year', 'cluster'])['count'].transform(lambda x: x / x.sum())
dominant_access = access_dist.sort_values(['year', 'cluster', 'proportion'], ascending=[True, True, False]) \
.groupby(['year', 'cluster']).first().reset_index()
gini_by_cluster = final_gdf_clustered.groupby(['year', 'cluster'])['scat_category'].apply(gini_index).reset_index()
gini_by_cluster.columns = ['year', 'cluster', 'gini_index']
access_gini_by_cluster = final_gdf_clustered.groupby(['year', 'cluster'])['PTAL2023_main'].apply(gini_index).reset_index()
access_gini_by_cluster.columns = ['year', 'cluster', 'gini_index']
return results, cluster_sil_scores, final_gdf_clustered, transitions, cluster_trends, cluster_trends_pivot, cluster_proportion_trends_pivot, \
sector_dist, dominant_sectors, access_dist, dominant_access, gini_by_cluster, access_gini_by_cluster
Load Business Dataset, VNEB, OKR and OAs Bounderies¶
In [8]:
oas = "Data\\OA_2021_EW_BFE_V9.shp"
oas_gdf = gpd.read_file(oas)
oas_gdf = oas_gdf[oas_gdf['LSOA21NM'].apply(lambda x: ('wandsworth' in x.lower()) or ('lambeth' in x.lower()))]
oas_gdf["area_m2"] = oas_gdf.geometry.area
##############
ptal_path = "Data\\PTAL_2023_Grid_100m_100m.shp"
ptal_df = gpd.read_file(ptal_path)
##############
oa_area = 'Data\\Opportunity_Areas.shp'
oa_area_gdf = gpd.read_file(oa_area)
wand_oa_area_gdf = oa_area_gdf[oa_area_gdf['borough'] == 'Lambeth, Wandsworth'].copy()
wand_oa_area_gdf = wand_oa_area_gdf.to_crs(oas_gdf.crs)
################
print(oas_gdf.shape)
oas_area_wandsworth = oas_gdf.sjoin(wand_oa_area_gdf, predicate = 'intersects'
).drop(columns = ['index_right'])
oas_area_wandsworth.columns = [c.lower() for c in oas_area_wandsworth.columns]
oas_area_wandsworth = oas_area_wandsworth.rename(columns = {'long' : 'oa_long', 'lat' : 'oa_lat'})
oas_area_wandsworth = gpd.GeoDataFrame(oas_area_wandsworth,
geometry=oas_area_wandsworth['geometry'], crs="EPSG:27700")
print(oas_area_wandsworth.shape)
(2045, 11) (111, 38)
In [12]:
gdf_all = load_data("Data/final_br_census_oa_df.pkl", year_col='year', lat_col='lat', lon_col='long')
categories = ['Retail', 'Leisure', 'Office', 'Industrial']
tmp1 = gdf_all[(gdf_all['source'] == 'Census') & (gdf_all['year'] == 2012)].copy()
tmp1['year'] = 2011
tmp2 = gdf_all[(gdf_all['source'] == 'Both') & (gdf_all['year'] == 2012)].copy()
tmp2['year'] = 2011
tmp3 = gdf_all[(gdf_all['source'] == 'Census') & (gdf_all['year'] == 2012)].copy()
tmp3['year'] = 2010
tmp4 = gdf_all[(gdf_all['source'] == 'Both') & (gdf_all['year'] == 2012)].copy()
tmp4['year'] = 2010
gdf_all = pd.concat([gdf_all, tmp1, tmp2, tmp3, tmp4])
gdf_all['scat_category'] = gdf_all['scat_category'].apply(lambda x: x if x in categories else 'Other')
In [13]:
final_pop_oa_df = pd.read_pickle("Data/final_pop_oa_df.pkl")
oa_final_df = pd.read_pickle("Data/VNEB_oa_final_df.pkl")
oa_final_df['bus_density'] = (oa_final_df['bus_count'] / oa_final_df['area_m2'] * 1000000).apply(np.ceil)
Create Stations Dataframe¶
In [14]:
battersea_station = (51.479774, -0.1418745) # lat, lon
nine_elms = (51.4799781, -0.1285638)
stations_df = gpd.GeoDataFrame({
'name': ['Battersea Power Station', 'Nine Elms'],
'geometry': [Point(battersea_station[1], battersea_station[0]), # lon, lat
Point(nine_elms[1], nine_elms[0])]
}, crs='EPSG:4326')
stations_df_tmp = stations_df.to_crs(epsg=3857)
stations_df
Out[14]:
| name | geometry | |
|---|---|---|
| 0 | Battersea Power Station | POINT (-0.14187 51.47977) |
| 1 | Nine Elms | POINT (-0.12856 51.47998) |
Plot Overaall and Industry-Specific Business Density¶
In [6]:
years = [2012, 2015, 2018, 2021, 2024]
industry = 'All'
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
(oa_final_df['industry'] == industry)].copy()
oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)
vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis
fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()
for i, year in enumerate(years):
ax = axes[i]
oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
(oa_final_df['industry'] == industry)].copy()
oas_year = oas_area_wandsworth.set_index('oa21cd').join(
oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
).reset_index()
oas_year = oas_year.to_crs(epsg=3857)
oas_year.plot(
ax=ax,
column='bus_count',
cmap=cmap,
norm=norm,
edgecolor='white',
linewidth=0.5,
legend=False
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
ax.set_axis_off()
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)
plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
In [7]:
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Office'
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
(oa_final_df['industry'] == industry)].copy()
oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)
vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis
fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()
for i, year in enumerate(years):
ax = axes[i]
# Filter data for the specific year and industry
oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
(oa_final_df['industry'] == industry)].copy()
oas_year = oas_area_wandsworth.set_index('oa21cd').join(
oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
).reset_index()
oas_year = oas_year.to_crs(epsg=3857)
oas_year.plot(
ax=ax,
column='bus_count',
cmap=cmap,
norm=norm,
edgecolor='white',
linewidth=0.5,
legend=False
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
ax.set_axis_off()
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)
plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
In [8]:
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Retail'
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
(oa_final_df['industry'] == industry)].copy()
oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)
vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis
fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()
for i, year in enumerate(years):
ax = axes[i]
oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
(oa_final_df['industry'] == industry)].copy()
oas_year = oas_area_wandsworth.set_index('oa21cd').join(
oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
).reset_index()
oas_year = oas_year.to_crs(epsg=3857)
oas_year.plot(
ax=ax,
column='bus_count',
cmap=cmap,
norm=norm,
edgecolor='white',
linewidth=0.5,
legend=False
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
ax.set_axis_off()
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)
plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
In [9]:
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Leisure'
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
(oa_final_df['industry'] == industry)].copy()
oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)
vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis
fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()
for i, year in enumerate(years):
ax = axes[i]
oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
(oa_final_df['industry'] == industry)].copy()
oas_year = oas_area_wandsworth.set_index('oa21cd').join(
oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
).reset_index()
oas_year = oas_year.to_crs(epsg=3857)
oas_year.plot(
ax=ax,
column='bus_count',
cmap=cmap,
norm=norm,
edgecolor='white',
linewidth=0.5,
legend=False
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
ax.set_axis_off()
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)
plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
In [10]:
years = [2012, 2015, 2018, 2021, 2024]
industry = 'Industrial'
combined_df = oa_final_df[(oa_final_df['year'].isin(years)) &
(oa_final_df['industry'] == industry)].copy()
oas_combined = oas_area_wandsworth.set_index('oa21cd').join(
combined_df.set_index('oa21cd')[['bus_count']]
).reset_index()
oas_combined = oas_combined.to_crs(epsg=3857)
vmin = oas_combined['bus_count'].min()
vmax = oas_combined['bus_count'].max()
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
cmap = cm.viridis
fig, axes = plt.subplots(2, 3, figsize=(38, 30), sharex=True, sharey=True)
axes = axes.flatten()
for i, year in enumerate(years):
ax = axes[i]
oa_final_df_year = oa_final_df[(oa_final_df['year'] == year) &
(oa_final_df['industry'] == industry)].copy()
oas_year = oas_area_wandsworth.set_index('oa21cd').join(
oa_final_df_year.set_index('oa21cd')[['bus_density', 'bus_count']]
).reset_index()
oas_year = oas_year.to_crs(epsg=3857)
oas_year.plot(
ax=ax,
column='bus_count',
cmap=cmap,
norm=norm,
edgecolor='white',
linewidth=0.5,
legend=False
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, crs=oas_year.crs)
ax.set_title(f'Business Count in {year}', fontsize=24, weight='bold')
ax.set_axis_off()
axes[-1].axis('off')
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=axes[:5], orientation='horizontal', fraction=0.02, pad=0)
cbar.set_label('Business Count', fontsize=16)
plt.suptitle("Wandsworth OA Business Count by Year", fontsize=30, weight='bold', y=1)
plt.tight_layout(rect=[0, 0.1, 1, 1])
plt.show()
In [11]:
final_pop_oa_df[['year', 'total_pop', 'pop_19_64']].groupby(['year']).sum()
stations_df = gpd.GeoDataFrame({
'name': ['Battersea Power Station', 'Nine Elms'],
'geometry': [Point(battersea_station[1], battersea_station[0]), # lon, lat
Point(nine_elms[1], nine_elms[0])]
}, crs='EPSG:4326')
In [12]:
filtered_jobs_df = pd.read_pickle("filtered_jobs_df.pkl")
filtered_jobs_df = filtered_jobs_df[filtered_jobs_df['industry'] == 'All']
In [13]:
buffer = 750
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(gdf_all.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
tmp = oa_final_df[oa_final_df['industry'] == 'All'].copy()
tmp['geometry'] = tmp.apply(lambda x: Point(x['oa_long'], x['oa_lat']), axis = 1)
tmp = gpd.GeoDataFrame(tmp, geometry = tmp['geometry'], crs = 'EPSG:4326')
tmp = tmp.to_crs(gdf_all.crs)
joined = tmp.sjoin(buffer_stations_df, predicate='within')
oas = joined['oa21cd'].unique()
t = filtered_jobs_df[filtered_jobs_df['oa21cd'].isin(oas)].groupby('year').sum()[['jobs_count']]
t1 = final_pop_oa_df[final_pop_oa_df['oa21cd'].isin(oas)].groupby('year').sum()[['total_pop','pop_19_64']]
t = t.join(t1).reset_index()
t['job_percent'] = t['jobs_count'] / t['total_pop']
t['job_percent1'] = t['jobs_count'] / t['pop_19_64']
Run HDBSCAN Clustering Overtime¶
In [15]:
gdf_all_tmp = gdf_all.to_crs("EPSG:4326")
results, cluster_silh_df, final_gdf_clustered, transitions, cluster_trends, cluster_trends_pivot, cluster_proportion_trends_pivot, \
sector_dist, dominant_sectors, access_dist, dominant_access, gini_by_cluster, access_gini_by_cluster = analyze_over_time(
gdf_all_tmp, metric='haversine')
(1654, 54) (96, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5206878326553299 Year: 2010 Moran's I: 0.0363, p-value: 0.1290 Clusters found: 34 (1697, 54) (96, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5091418924892184 Year: 2011 Moran's I: 0.0407, p-value: 0.1240 Clusters found: 34 (1694, 54) (96, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.49652268697254526 Year: 2012 Moran's I: 0.0410, p-value: 0.1180 Clusters found: 33 (1897, 54) (96, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5106155314098126 Year: 2013 Moran's I: 0.0778, p-value: 0.0430 Clusters found: 36 (1940, 54) (97, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4812629159403616 Year: 2014 Moran's I: 0.0869, p-value: 0.0280 Clusters found: 35 (2004, 54) (97, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.449516854287888 Year: 2015 Moran's I: 0.0842, p-value: 0.0330 Clusters found: 40 (2120, 54) (99, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.41707546042537463 Year: 2016 Moran's I: 0.0710, p-value: 0.0390 Clusters found: 44 (2530, 54) (104, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4452814908612487 Year: 2017 Moran's I: 0.0687, p-value: 0.0440 Clusters found: 48 (2653, 54) (105, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4630512364864433 Year: 2018 Moran's I: 0.0553, p-value: 0.0780 Clusters found: 53 (2966, 54) (110, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4367253113781164 Year: 2019 Moran's I: 0.0252, p-value: 0.1820 Clusters found: 58 (3018, 54) (110, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.44007158394547174 Year: 2020 Moran's I: 0.0160, p-value: 0.2560 Clusters found: 61 (3872, 54) (111, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5013442274578016 Year: 2021 Moran's I: -0.0077, p-value: 0.4380 Clusters found: 69 (4207, 54) (111, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.4905271081684077 Year: 2022 Moran's I: -0.0090, p-value: 0.4370 Clusters found: 74 (4718, 54) (111, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5347284507838003 Year: 2023 Moran's I: -0.0150, p-value: 0.4980 Clusters found: 83 (5336, 54) (111, 4) After trying the ranges for min and epis, the best values are (15, 0) with score is 0.5672692177179313 Year: 2024 Moran's I: -0.0163, p-value: 0.4910 Clusters found: 95
Plot Business Densities, Cluster Counts and Densities and CLusters around NLE Stations¶
In [19]:
buffer = 500
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
stations_with_density = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name'})
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
In [262]:
buffers = [500, 750]
for buffer in buffers:
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
stations_with_density = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name'})
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
hue = 'Station Name', linewidth=2.5, marker='o')
plt.title(f"Business Density within Stations {buffer}m Buffer")
plt.show()
for buffer in buffers:
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
stations_with_density = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density['Station Name Sector'] = stations_with_density.apply(lambda x: ' - '.join(
[x['Station Name'], x['Sector']]), axis = 1)
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
plt.figure(figsize=(8, 5))
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
hue = 'Station Name Sector', linewidth=2.5, marker='o')
plt.title(f"Business Density within Stations {buffer}m Buffer")
plt.show()
In [253]:
buffer = 500
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
stations_with_density1 = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name'})
stations_with_density1['Business Density'] = stations_with_density1['business_count'] / stations_with_density1['buffer_area_km2']
stations_with_density1['Buffer Distance'] = '500m'
buffer = 750
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
stations_with_density2 = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name'})
stations_with_density2['Business Density'] = stations_with_density2['business_count'] / stations_with_density2['buffer_area_km2']
stations_with_density2['Buffer Distance'] = '750m'
stations_with_density = pd.concat([stations_with_density1, stations_with_density2])[[
'Year', 'Buffer Distance', 'Business Density']].groupby(['Year', 'Buffer Distance']).mean().reset_index()
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2', hue = 'Buffer Distance',
linewidth=2.5, marker='o')
plt.title("Business Density Close to New Stations")
plt.show()
In [261]:
############################
buffer = 500
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
stations_with_density1 = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density1['Buffer Distance'] = '500m'
stations_with_density1['Sector Buffer Distance'] = stations_with_density1.apply(lambda x: ' - '.join(
[x['Sector'], x['Buffer Distance']]), axis = 1)
stations_with_density1['Business Density'] = stations_with_density1['business_count'] / stations_with_density1['buffer_area_km2']
buffer = 750
buffer_stations_df = stations_df_tmp.copy()
buffer_stations_df = buffer_stations_df.to_crs(final_gdf_clustered.crs)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered, buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
stations_with_density2 = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density2['Buffer Distance'] = '750m'
stations_with_density2['Sector Buffer Distance'] = stations_with_density2.apply(lambda x: ' - '.join(
[x['Sector'], x['Buffer Distance']]), axis = 1)
stations_with_density2['Business Density'] = stations_with_density2['business_count'] / stations_with_density2['buffer_area_km2']
stations_with_density = pd.concat([stations_with_density1, stations_with_density2])[[
'Year', 'Sector Buffer Distance', 'Business Density']].groupby(['Year',
'Sector Buffer Distance']).mean().reset_index()
plt.figure(figsize=(8, 5))
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2', hue = 'Sector Buffer Distance',
linewidth=2.5, marker='o')
plt.title("Business Density Close to New Stations by Industry")
plt.show()
In [55]:
t = final_gdf_clustered[['year','cluster']].drop_duplicates()
t = t.sort_values(by = 'year')
t = t.groupby('year').count().rename(columns = {'cluster' : 'Cluster Count'})
plt.figure(figsize=(7, 4))
sns.lineplot(data=t, x='year', y='Cluster Count', palette='Set2', linewidth=2.5, marker='o')
plt.title("Overall VNEB Cluster Counts")
Out[55]:
Text(0.5, 1.0, 'Overall VNEB Cluster Counts')
Plot Silhoutte Score Overtime and For 2024 Year Over Search Space¶
In [53]:
cluster_silh_df_tmp = cluster_silh_df.copy()
cluster_silh_df_tmp['Hyper Parameters'] = cluster_silh_df_tmp.apply(lambda x: ' - '.join(
[str(int(x['min_cluster_size'])), str(int(x['epsilon_m']))]), axis = 1)
plt.figure(figsize=(7, 4))
sns.lineplot(data = cluster_silh_df_tmp[(cluster_silh_df_tmp['year'] == 2024)],
x='Hyper Parameters', y='silhouette', palette='Set2', linewidth=2.5, marker='o')
plt.xticks(rotation=45, ha='right', fontsize=7)
plt.title("Clustering Silhouette Score for 2024")
Out[53]:
Text(0.5, 1.0, 'Clustering Silhouette Score for 2024')
In [54]:
plt.figure(figsize=(7, 4))
sns.lineplot(data = cluster_silh_df[(cluster_silh_df['min_cluster_size'] == 15) &
(cluster_silh_df['epsilon_m'] == 0)], x='year', y='silhouette', palette='Set2', linewidth=2.5, marker='o')
plt.title("Clustering Silhouette Score over time")
Out[54]:
Text(0.5, 1.0, 'Clustering Silhouette Score over time')
Generate Other Clusters Plots and Entropy (Diversity Plots)¶
In [27]:
years = [2012, 2015, 2018, 2021, 2024]
sns.lineplot(gini_by_cluster[gini_by_cluster['cluster'] != -1], x = 'year', y = 'gini_index')
plt.title('Business Clusters Sector Gini Index Over Time')
plt.xlabel('Date')
plt.ylabel('Sector Gini Index')
plt.tight_layout()
plt.show()
sns.lineplot(gini_by_cluster[gini_by_cluster['cluster'] != -1].groupby('year').mean()['gini_index'])
plt.title('Average Business Clusters Sector Gini Index Over Time')
plt.xlabel('Date')
plt.ylabel('Average Sector Gini Index')
plt.tight_layout()
plt.show()
sns.lineplot(access_gini_by_cluster[access_gini_by_cluster['cluster'] != -1], x = 'year', y = 'gini_index')
plt.title('Business Clusters Accessibility Gini Index Over Time')
plt.xlabel('Date')
plt.ylabel('Accessibility Gini Index')
plt.tight_layout()
plt.show()
sns.lineplot(access_gini_by_cluster[access_gini_by_cluster['cluster'] != -1].groupby('year').mean()['gini_index'])
plt.title('Average Business Clusters Accessibility Gini Index Over Time')
plt.xlabel('Date')
plt.ylabel('Average Accessibility Gini Index')
plt.tight_layout()
plt.show()
sns.lineplot(cluster_trends[cluster_trends['cluster'] != -1], x = 'year', y = 'count')
plt.title('Business Cluster Sizes Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Businesses')
plt.tight_layout()
plt.show()
sns.lineplot(cluster_trends[cluster_trends['cluster'] == -1], x = 'year', y = 'count')
plt.title('Business Noise Size Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Businesses')
plt.tight_layout()
plt.show()
sns.lineplot(cluster_trends[cluster_trends['cluster'] != -1], x = 'year', y = 'proportion')
plt.title('Business Cluster Proportion Over Time')
plt.xlabel('Date')
plt.ylabel('Proportion of Businesses')
plt.tight_layout()
plt.show()
sns.lineplot(cluster_trends[cluster_trends['cluster'] == -1], x = 'year', y = 'proportion')
plt.title('Business Noise Proportion Over Time')
plt.xlabel('Date')
plt.ylabel('Proportion of Businesses')
plt.tight_layout()
plt.show()
cluster_trends_pivot[cluster_trends_pivot.columns[1:]].plot(figsize=(12,6))
plt.title('Business Cluster Sizes Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Businesses')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()
cluster_proportion_trends_pivot[cluster_proportion_trends_pivot.columns[1:]].plot(figsize=(12,6))
plt.title('Business Cluster Proportion Over Time')
plt.xlabel('Date')
plt.ylabel('Proportion of Businesses')
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()
pivot = access_gini_by_cluster[(access_gini_by_cluster['cluster'] != -1)]
pivot = pivot.pivot_table(index='year', columns='cluster',
values='gini_index', aggfunc='sum').fillna(0)
sns.heatmap(pivot, cmap='viridis', fmt='g')
plt.title('Accessibility Gini Index Distribution Across Clusters')
plt.show()
pivot = gini_by_cluster[(gini_by_cluster['cluster'] != -1)]
pivot = pivot.pivot_table(index='year', columns='cluster',
values='gini_index', aggfunc='sum').fillna(0)
sns.heatmap(pivot, cmap='viridis', fmt='g')
plt.title('Sector Gini Index Distribution Across Clusters')
plt.show()
pivot = sector_dist.pivot_table(index='scat_category', columns='cluster',
values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], cmap='viridis', fmt='g')
plt.title('Sector Distribution Across Clusters')
plt.show()
pivot = access_dist.pivot_table(index='PTAL2023_main', columns='cluster',
values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], cmap='viridis', fmt='g')
plt.title('Accessibility Distribution Across Clusters')
plt.show()
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for i, ax in enumerate(axes):
year = years[i]
pivot = sector_dist[sector_dist['year'] == year].pivot_table(index='scat_category', columns='cluster',
values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], ax=ax, cmap='viridis', cbar=(i == 4), fmt='g') # Show colorbar only on last
ax.set_title(f'Sector-Clusters {year}')
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for i, ax in enumerate(axes):
year = years[i]
pivot = access_dist[access_dist['year'] == year].pivot_table(index='PTAL2023_main', columns='cluster',
values='count', aggfunc='sum').fillna(0)
sns.heatmap(pivot[pivot.columns[1:]], ax=ax, cmap='viridis', cbar=(i == 4), fmt='g') # Show colorbar only on last
ax.set_title(f'Accessibility-Clusters {year}')
for i, year in enumerate(years):
df = final_gdf_clustered.sort_values(by=['CompanyID', 'year']).copy()
df['prev_cluster'] = df.groupby('CompanyID')['cluster'].shift(1)
df['prev_year'] = df.groupby('CompanyID')['year'].shift(1)
transitions_tmp = df[df['year'] == year].dropna(subset=['prev_cluster'])
transitions_tmp['source'] = transitions_tmp['prev_year'].astype(int).astype(str) + '_C' + transitions_tmp['prev_cluster'].astype(int).astype(str)
transitions_tmp['target'] = transitions_tmp['year'].astype(int).astype(str) + '_C' + transitions_tmp['cluster'].astype(int).astype(str)
transition_counts = transitions_tmp.groupby(['source', 'target']).size().reset_index(name='count')
all_nodes = pd.unique(transition_counts[['source', 'target']].values.ravel())
node_mapping = {label: idx for idx, label in enumerate(all_nodes)}
transition_counts['source_idx'] = transition_counts['source'].map(node_mapping)
transition_counts['target_idx'] = transition_counts['target'].map(node_mapping)
fig = go.Figure(data=[go.Sankey(
node=dict(
pad=15,
thickness=20,
line=dict(color='black', width=0.5),
label=all_nodes,
color='lightblue'),
link=dict(
source=transition_counts['source_idx'],
target=transition_counts['target_idx'],
value=transition_counts['count']))])
fig.update_layout(title_text=f"Cluster Transitions Sankey {year}",font_size=12, width=1000, height=500)
fig.show()
fig, axes = plt.subplots(1, 5, figsize=(20, 5), sharex=True, sharey=True)
for i, year in enumerate(years):
ax = axes[i]
tmp_df = final_gdf_clustered[final_gdf_clustered['year'] == year].copy()
if tmp_df.crs != 'EPSG:3857':
tmp_df = tmp_df.to_crs(epsg=3857)
tmp_df.plot(column='cluster', cmap='tab20', legend=False, ax=ax, markersize=5)
ax.scatter(stations_df_tmp.geometry.x, stations_df_tmp.geometry.y, color='blue',marker='*',s=300,label='Station',zorder=5)
ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
ax.set_title(f'Year: {year}', fontsize=12)
ax.set_axis_off()
plt.tight_layout()
plt.show()
############################
if stations_df_tmp.crs != 'EPSG:3857':
stations_df_tmp = stations_df_tmp.to_crs(epsg=3857)
stations_buffer1 = stations_df_tmp.copy()
stations_buffer1['geometry'] = stations_buffer1.buffer(500)
stations_buffer2 = stations_df_tmp.copy()
stations_buffer2['geometry'] = stations_buffer2.buffer(750)
cluster_ids = final_gdf_clustered[final_gdf_clustered['cluster'] != -1]['cluster'].unique()
cluster_ids = sorted(cluster_ids)
n_clusters = len(cluster_ids)
cmap = cm.get_cmap('tab20', n_clusters)
norm = mcolors.BoundaryNorm(boundaries=np.arange(min(cluster_ids)-0.5, max(cluster_ids)+1.5), ncolors=n_clusters)
for year in years:
fig, ax = plt.subplots(figsize=(8, 5))
tmp_df = final_gdf_clustered[(final_gdf_clustered['year'] == year) & (final_gdf_clustered['cluster'] != -1)
].copy()
if tmp_df.crs != 'EPSG:3857':
tmp_df = tmp_df.to_crs(epsg=3857)
sc = tmp_df.plot(ax=ax, column='cluster', cmap=cmap, markersize=5, legend=False, zorder=3)
stations_buffer1.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
stations_buffer2.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
stations_df_tmp.plot(ax=ax, color='black', marker='o', markersize=50, label='Station', zorder=4)
ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
ax.set_title(f'Business Clusters – Year: {year}', fontsize=14)
ax.set_axis_off()
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([]) # Only needed for colorbar
cbar = fig.colorbar(sm, ax=ax, orientation='vertical', shrink=0.6, pad=0.01)
cbar.set_label('Cluster ID', fontsize=10)
plt.tight_layout()
plt.show()
#######################extra analysis##############################
final_gdf_clustered['y'] = final_gdf_clustered.geometry.y
final_gdf_clustered['x'] = final_gdf_clustered.geometry.x
new_clusters = []
metrics = []
for year in final_gdf_clustered['year'].unique():
df_year = final_gdf_clustered[final_gdf_clustered['year'] == year]
n_clusters = df_year['cluster'].nunique() - (1 if -1 in df_year['cluster'].unique() else 0)
cluster_sizes = df_year[df_year['cluster'] != -1].groupby('cluster').size()
mean_size = cluster_sizes.mean()
share_clustered = len(cluster_sizes) * mean_size / len(df_year)
metrics.append({'year': year, 'Clusters Count': n_clusters, 'Cluster Size Average': mean_size,
'Clustered Share': share_clustered})
clusters = df_year.groupby('cluster')
for label, group in clusters:
if label == -1: continue # ignore noise
centroid = Point(group['x'].mean(), group['y'].mean())
b, min_dis, distances = near_station(centroid, stations_df, station_buffer = 500)
b2, min_dis2, distances2 = near_station(centroid, stations_df, station_buffer = 750)
dic = {'year' : [year], 'cluster' : [label], 'Centroid' : [centroid], 'Min Distance' : [min_dis],
'Within Buffer' : [b], 'Buffer Distance' : ['500m']}
dic2 = {'year' : [year], 'cluster' : [label], 'Centroid' : [centroid], 'Min Distance' : [min_dis2],
'Within Buffer' : [b2], 'Buffer Distance' : ['750m']}
for k in distances:
dic[f'{k} Distance'] = [distances[k]]
for k in distances2:
dic2[f'{k} Distance'] = [distances2[k]]
new_clusters.append(pd.DataFrame(dic))
new_clusters.append(pd.DataFrame(dic2))
new_clusters = pd.concat(new_clusters)
domin_new_clusters = new_clusters.set_index(['year', 'cluster']).join(dominant_sectors.set_index(['year', 'cluster'])).reset_index()
sector_new_clusters = new_clusters.set_index(['year', 'cluster']).join(sector_dist.set_index(['year', 'cluster'])).reset_index()
t = domin_new_clusters[domin_new_clusters['Within Buffer']].groupby(['year', 'Buffer Distance']).count()[['cluster']].reset_index().rename(
columns = {'cluster' : 'Cluster Count', 'year' : 'Year'})
tt = domin_new_clusters[domin_new_clusters['Within Buffer']].groupby(['year', 'Buffer Distance', 'scat_category']
).count()[['cluster']].reset_index().rename(columns = {'cluster' : 'Cluster Count',
'scat_category' : 'Sector', 'year' : 'Year'})
tt = tt[tt['Sector'] != 'Other']
tt['Sector Buffer Distance'] = tt.apply(lambda x: ' - '.join([x['Sector'], x['Buffer Distance']]), axis = 1)
sns.lineplot(data=t, x='Year', y='Cluster Count', palette='Set2', hue = 'Buffer Distance', linewidth=2.5, marker='o')
plt.title("Cluster Counts Close to New Stations")
plt.show()
sns.lineplot(data=tt, x='Year', y='Cluster Count', hue='Sector Buffer Distance', palette='Set2', linewidth=2.5, marker='o')
plt.title("Cluster Counts Close to New Stations by Sector")
plt.show()
t = sector_new_clusters[sector_new_clusters['Within Buffer']][['year', 'Buffer Distance', 'count']].groupby(['year', 'Buffer Distance']
).sum().reset_index().rename(columns = {'count' : 'Business Count', 'year' : 'Year'})
tt = sector_new_clusters[sector_new_clusters['Within Buffer']][['year', 'Buffer Distance', 'scat_category', 'count']].groupby([
'year', 'Buffer Distance', 'scat_category']).sum().reset_index().rename(columns = {'count' : 'Business Count',
'scat_category' : 'Sector', 'year' : 'Year'})
tt = tt[tt['Sector'] != 'Other']
tt['Sector Buffer Distance'] = tt.apply(lambda x: ' - '.join([x['Sector'], x['Buffer Distance']]), axis = 1)
sns.lineplot(data=t, x='Year', y='Business Count', palette='Set2', hue = 'Buffer Distance', linewidth=2.5, marker='o')
plt.title("Business Counts Close to New Stations")
plt.show()
sns.lineplot(data=tt, x='Year', y='Business Count', hue='Sector Buffer Distance', palette='Set2', linewidth=2.5, marker='o')
plt.title("Business Counts Close to New Stations by Sector")
plt.show()
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '500m'], x='year', y='Nine Elms Distance', label='To Nine Elms (500m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '500m'], x='year', y='Battersea Power Station Distance',
label='To Battersea Power Station (500m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '750m'], x='year', y='Nine Elms Distance', label='To Nine Elms (750m)')
sns.lineplot(data=domin_new_clusters[domin_new_clusters['Buffer Distance'] == '750m'], x='year', y='Battersea Power Station Distance',
label='To Battersea Power Station (750m)')
plt.ylabel("Mean Distance to Station (m)")
plt.title("Cluster Centroid Proximity Over Time")
plt.show()
##############
metrics_df = pd.DataFrame(metrics)
# Plot
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
sns.lineplot(data=metrics_df, x='year', y='Clusters Count', ax=axes[0]).set_title("Number of Clusters")
sns.lineplot(data=metrics_df, x='year', y='Cluster Size Average', ax=axes[1]).set_title("Avg. Cluster Size")
sns.lineplot(data=metrics_df, x='year', y='Clustered Share', ax=axes[2]).set_title("Share of Clustered Businesses")
plt.tight_layout()
plt.show()
############
noise_stats = final_gdf_clustered.groupby('year')['cluster'].apply(lambda x: (x == -1).sum() / len(x)).reset_index()
noise_stats.columns = ['Year', 'Noise Share']
sns.lineplot(data=noise_stats, x='Year', y='Noise Share')
plt.title("Share of Businesses Not in Clusters (Noise)")
plt.ylabel("Proportion")
plt.show()
#################
diversity_data = []
for year in sorted(final_gdf_clustered['year'].unique()):
df_year = final_gdf_clustered[final_gdf_clustered['year'] == year]
for label, group in df_year.groupby('cluster'):
if label == -1: continue
counts = group['scat_category'].value_counts()
H = entropy(counts)
diversity_data.append({'year': year, 'cluster': label, 'entropy': H})
diversity_df = pd.DataFrame(diversity_data)
sns.lineplot(data=diversity_df.groupby('year')['entropy'].mean().reset_index(), x='year', y='entropy')
plt.title("Mean Business Sector Diversity (Entropy) in Clusters Over Time")
plt.ylabel("Shannon Entropy")
plt.show()
diversity_data = []
for year in sorted(final_gdf_clustered['year'].unique()):
df_year = final_gdf_clustered[final_gdf_clustered['year'] == year]
for label, group in df_year.groupby('cluster'):
if label == -1: continue
counts = group['bus_PTAL_2023'].value_counts()
H = entropy(counts)
diversity_data.append({'year': year, 'cluster': label, 'entropy': H})
diversity_df = pd.DataFrame(diversity_data)
sns.lineplot(data=diversity_df.groupby('year')['entropy'].mean().reset_index(), x='year', y='entropy')
plt.title("Mean Business Accessibility Diversity (Entropy) in Clusters Over Time")
plt.ylabel("Shannon Entropy")
plt.show()
###############################
agg_results = []
for year in sorted(final_gdf_clustered['year'].unique()):
year_df = final_gdf_clustered[final_gdf_clustered['year'] == year]
for cluster_id, group in year_df.groupby('cluster'):
if cluster_id == -1: continue # skip noise
counts = group[group['scat_category'] != 'Other']['scat_category'].value_counts()
H = compute_entropy(counts)
G = compute_gini(counts)
agg_results.append({
'year': year,
'cluster': cluster_id,
'entropy': H,
'gini': G,
'n_businesses': len(group),
'x': group['x'].mean(),
'y': group['y'].mean()
})
agg_df = pd.DataFrame(agg_results)
agg_df['entropy_norm'] = agg_df.groupby('year')['entropy'].transform(
lambda x: (x - x.min()) / (x.max() - x.min())
)
agg_df['gini_norm'] = agg_df.groupby('year')['gini'].transform(
lambda x: (x - x.min()) / (x.max() - x.min())
)
ent_thresh = agg_df['entropy_norm'].quantile(0.7)
gini_thresh = agg_df['gini_norm'].quantile(0.3)
agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, ent_thresh=gini_thresh, gini_thresh=ent_thresh, entrop_col = 'entropy_norm',
gini_col = 'gini_norm'), axis=1)
trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
).reset_index()
trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
trend_df.set_index('year')).reset_index()
trend_df['Cluster Proportions'] = trend_df['count'] / trend_df['Total Count']
trend_df['Businesses Proportion'] = trend_df['n_businesses'] / trend_df['Total Businesses']
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='count', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Counts Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='n_businesses', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Total Businesses Over Time")
plt.ylabel("Number of Businesses")
plt.xlabel("Year")
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='Cluster Proportions', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Proportion Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='Businesses Proportion', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Businesses Proportion Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()
################
centroid = Point(group['x'].mean(), group['y'].mean())
agg_df['_'], agg_df['Min Distance'], agg_df['Distances'] = zip(*agg_df.apply(lambda x:
near_station(Point(x['x'], x['y']), stations_df), axis=1))
plt.figure(figsize=(8, 6))
sns.boxplot(data=agg_df, x='Agglomeration Type', y='Min Distance')
plt.title("Distance to NLE Stations by Agglomeration Type")
plt.ylabel("Distance (meters)")
plt.xlabel("Agglomeration Type")
plt.show()
#######################business density plots####################
epsg=27700
buffers = [500, 750]
for buffer in buffers:
buffer_stations_df = stations_df.copy()
buffer_stations_df = buffer_stations_df.to_crs(epsg)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered.to_crs(epsg), buffer_stations_df, how='inner', predicate='within')
business_counts = joined.groupby(['index_right', 'year']).size().reset_index(name='business_count')
stations_with_density = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name'})
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
plt.figure(figsize=(10, 6))
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
hue = 'Station Name', linewidth=2.5, marker='o')
plt.title(f"Business Density within Stations {buffer}m Buffer")
plt.show()
for buffer in buffers:
buffer_stations_df = stations_df.copy()
buffer_stations_df = buffer_stations_df.to_crs(epsg)
buffer_stations_df['geometry'] = buffer_stations_df.buffer(buffer)
buffer_stations_df['buffer_area_km2'] = buffer_stations_df.geometry.area / 1e6
joined = gpd.sjoin(final_gdf_clustered.to_crs(epsg), buffer_stations_df, how='inner', predicate='within')
joined = joined[joined['scat_category'] != 'Other']
business_counts = joined.groupby(['index_right', 'year', 'scat_category']).size().reset_index(name='business_count')
stations_with_density = buffer_stations_df.join(
business_counts.set_index('index_right')[['business_count', 'scat_category', 'year']]).fillna(0).rename(columns = {
'year' : 'Year', 'name' : 'Station Name', 'scat_category' : 'Sector'})
stations_with_density['Station Name Sector'] = stations_with_density.apply(lambda x: ' - '.join(
[x['Station Name'], x['Sector']]), axis = 1)
stations_with_density['Business Density'] = stations_with_density['business_count'] / stations_with_density['buffer_area_km2']
plt.figure(figsize=(10, 6))
sns.lineplot(data=stations_with_density, x='Year', y='Business Density', palette='Set2',
hue = 'Station Name Sector', linewidth=2.5, marker='o')
plt.title(f"Business Density within Stations {buffer}m Buffer")
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[27], line 363 361 ent_thresh = agg_df['entropy_norm'].quantile(0.7) 362 gini_thresh = agg_df['gini_norm'].quantile(0.3) --> 363 agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, ent_thresh=gini_thresh, gini_thresh=ent_thresh, entrop_col = 'entropy_norm', 364 gini_col = 'gini_norm'), axis=1) 366 trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type' 367 ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']] 368 ).reset_index() 369 trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join( 370 agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']] 371 ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join( 372 trend_df.set_index('year')).reset_index() File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\frame.py:10034, in DataFrame.apply(self, func, axis, raw, result_type, args, by_row, **kwargs) 10022 from pandas.core.apply import frame_apply 10024 op = frame_apply( 10025 self, 10026 func=func, (...) 10032 kwargs=kwargs, 10033 ) > 10034 return op.apply().__finalize__(self, method="apply") File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:837, in FrameApply.apply(self) 834 elif self.raw: 835 return self.apply_raw() --> 837 return self.apply_standard() File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:965, in FrameApply.apply_standard(self) 964 def apply_standard(self): --> 965 results, res_index = self.apply_series_generator() 967 # wrap results 968 return self.wrap_results(results, res_index) File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pandas\core\apply.py:981, in FrameApply.apply_series_generator(self) 978 with option_context("mode.chained_assignment", None): 979 for i, v in enumerate(series_gen): 980 # ignore SettingWithCopy here in case the user mutates --> 981 results[i] = self.func(v, *self.args, **self.kwargs) 982 if isinstance(results[i], ABCSeries): 983 # If we have a view on v, we need to make a copy because 984 # series_generator will swap out the underlying data 985 results[i] = results[i].copy(deep=False) Cell In[27], line 363, in <lambda>(x) 361 ent_thresh = agg_df['entropy_norm'].quantile(0.7) 362 gini_thresh = agg_df['gini_norm'].quantile(0.3) --> 363 agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, ent_thresh=gini_thresh, gini_thresh=ent_thresh, entrop_col = 'entropy_norm', 364 gini_col = 'gini_norm'), axis=1) 366 trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type' 367 ]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']] 368 ).reset_index() 369 trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join( 370 agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']] 371 ).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join( 372 trend_df.set_index('year')).reset_index() TypeError: classify_row() got an unexpected keyword argument 'gini_thresh'
Plot Agglomeration Types Overtime¶
In [175]:
agg_results = []
for year in sorted(final_gdf_clustered['year'].unique()):
year_df = final_gdf_clustered[final_gdf_clustered['year'] == year]
for cluster_id, group in year_df.groupby('cluster'):
if cluster_id == -1: continue # skip noise
counts = group[group['scat_category'] != 'Other']['scat_category'].value_counts()
H = compute_entropy(counts)
G = compute_gini(counts)
agg_results.append({
'year': year,
'cluster': cluster_id,
'entropy': H,
'gini': G,
'n_businesses': len(group),
'x': group['x'].mean(),
'y': group['y'].mean()
})
agg_df = pd.DataFrame(agg_results)
# Normalize entropy between 0–1
agg_df['entropy_norm'] = agg_df.groupby('year')['entropy'].transform(
lambda x: (x - x.min()) / (x.max() - x.min())
)
agg_df['gini_norm'] = agg_df.groupby('year')['gini'].transform(
lambda x: (x - x.min()) / (x.max() - x.min())
)
######################
agg_df['Agglomeration Type'] = agg_df.apply(lambda x: classify_row(x, agg_df, entrop_col = 'entropy_norm',
mode = 'entropy_quantile'), axis=1)
trend_df = agg_df.groupby(['year', 'Agglomeration Type']).size().reset_index(name='count').set_index(['year', 'Agglomeration Type'
]).join(agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year', 'Agglomeration Type']).sum()[['n_businesses']]
).reset_index()
trend_df = agg_df.groupby(['year']).size().reset_index(name='count').set_index(['year']).join(
agg_df[['year', 'Agglomeration Type', 'n_businesses']].groupby(['year']).sum()[['n_businesses']]
).rename(columns = {'count' : 'Total Count', 'n_businesses': 'Total Businesses'}).join(
trend_df.set_index('year')).reset_index()
trend_df['Cluster Proportions'] = trend_df['count'] / trend_df['Total Count']
trend_df['Businesses Proportion'] = trend_df['n_businesses'] / trend_df['Total Businesses']
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='count', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Counts Over Time")
plt.ylabel("Number of Clusters")
plt.xlabel("Year")
plt.grid(True)
plt.show()
plt.figure(figsize=(10, 6))
sns.lineplot(data=trend_df, x='year', y='n_businesses', hue='Agglomeration Type', marker='o')
plt.title("Agglomeration Type Clusters Total Businesses Over Time")
plt.ylabel("Number of Businesses")
plt.xlabel("Year")
plt.grid(True)
plt.show()
centroid = Point(group['x'].mean(), group['y'].mean())
agg_df['_'], agg_df['Min Distance'], agg_df['Distances'] = zip(*agg_df.apply(lambda x:
near_station(Point(x['x'], x['y']), stations_df), axis=1))
plt.figure(figsize=(8, 6))
sns.boxplot(data=agg_df, x='Agglomeration Type', y='Min Distance')
plt.title("Distance to NLE Stations by Agglomeration Type")
plt.ylabel("Distance (meters)")
plt.xlabel("Agglomeration Type")
plt.show()
In [179]:
import matplotlib.patches as mpatches
for year in years:
fig, ax = plt.subplots(figsize=(8, 5))
tmp_df = final_gdf_clustered[(final_gdf_clustered['year'] == year) & (final_gdf_clustered['cluster'] != -1)].copy()
tmp_agg_df = agg_df[agg_df['year'] == year][['cluster', 'entropy', 'entropy_norm', 'Agglomeration Type']].drop_duplicates().copy()
tmp_dominant_sectors = dominant_sectors[dominant_sectors['year'] == year].rename(columns = {'scat_category' : 'dom_scat_category'}
).drop(columns = ['year', 'proportion'])
tmp_df = tmp_df.set_index(['cluster']).join(tmp_agg_df.set_index('cluster')).join(tmp_dominant_sectors.set_index('cluster')).reset_index()
tmp_df = tmp_df[(tmp_df['scat_category'] != 'Other')]
tmp_df['Agglomeration Type Sector'] = tmp_df.apply(lambda x: x['Agglomeration Type'] if x['Agglomeration Type'] in ['High diversity', 'Moderate diversity']
else ' - '.join([x['Agglomeration Type'], x['scat_category']]), axis = 1)
tmp_df = tmp_df.sort_values(by = 'Agglomeration Type Sector')
if tmp_df.crs != 'EPSG:3857':
tmp_df = tmp_df.to_crs(epsg=3857)
sc = tmp_df.plot(ax=ax, column='Agglomeration Type Sector', cmap=cmap, markersize=5, legend=False, zorder=3)
stations_buffer1.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
stations_buffer2.plot(ax=ax, color='none', edgecolor='black', linestyle='--', linewidth=1.2, alpha=0.6, zorder=2)
stations_df_tmp.plot(ax=ax, color='black', marker='o', markersize=50, label='Station', zorder=4)
ctx.add_basemap(ax, source=ctx.providers.CartoDB.PositronNoLabels, crs=tmp_df.crs)
ax.set_title(f'Business Clusters by Agglomeration Type – Year: {year}', fontsize=14)
ax.set_axis_off()
categories = tmp_df['Agglomeration Type Sector'].unique()
colors = [cmap(norm(cat)) if 'norm' in locals() else cmap(i / (len(categories)-1)) for i, cat in enumerate(categories)]
patches = [mpatches.Patch(color=col, label=cat) for col, cat in zip(colors, categories)]
ax.legend(handles=patches, title='Agglomeration Type', loc='upper left', fontsize=9, title_fontsize=10)
plt.tight_layout()
plt.show()